In [ ]:
from google.colab import drive
drive.mount('/content/drive/')
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).

Load required libraries

In [ ]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D, SpatialDropout1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords # Remove stop words
import string # Remove punctuation
from wordcloud import WordCloud
nltk.download('wordnet')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[ ]:
True

Load data file into a dataframe

In [ ]:
file_path =  '/content/drive/My Drive/input_data.xlsx'
data_df =  pd.read_excel(file_path)
data_df.head()
Out[ ]:
Short description Description Caller Assignment group
0 login issue -verified user details.(employee# & manager na... spxjnwir pjlcoqds GRP_0
1 outlook \r\n\r\nreceived from: hmjdrvpb.komuaywn@gmail... hmjdrvpb komuaywn GRP_0
2 cant log in to vpn \r\n\r\nreceived from: eylqgodm.ybqkwiam@gmail... eylqgodm ybqkwiam GRP_0
3 unable to access hr_tool page unable to access hr_tool page xbkucsvz gcpydteq GRP_0
4 skype error skype error owlgqjme qhcozdfx GRP_0

EDA

Shape of the dataset

In [ ]:
data_df.shape
Out[ ]:
(8500, 4)

Describe the dataset

In [ ]:
data_df.describe()
Out[ ]:
Short description Description Caller Assignment group
count 8492 8499 8500 8500
unique 7481 7817 2950 74
top password reset the bpctwhsn kzqsbmtp GRP_0
freq 38 56 810 3976

Check data-types of columns

In [ ]:
data_df.dtypes
Out[ ]:
Short description    object
Description          object
Caller               object
Assignment group     object
dtype: object

Check for null values and replace them with stop words.

In [ ]:
data_df.isnull().sum()
Out[ ]:
Short description    8
Description          1
Caller               0
Assignment group     0
dtype: int64
In [ ]:
null_indices =  np.where(pd.isnull(data_df))
# replace nulls with stop words
data_df["Short description"].fillna("the", inplace = True)
data_df["Description"].fillna("the", inplace = True)
data_df.isnull().any()
Out[ ]:
Short description    False
Description          False
Caller               False
Assignment group     False
dtype: bool

Check for assignment group wise count of tickets

In [ ]:
group_count = data_df['Assignment group'].value_counts()
sns.barplot(group_count.index[:9], group_count.values[:9], alpha=0.8)
plt.title('Category wise number of tickets')
plt.ylabel('Number of tickets', fontsize=12)
plt.xlabel('Assignment groups', fontsize=12)
plt.show()

46% of tickets are assigned to assignment group "GRP_0".

Check for caller wise count of tickets

In [ ]:
group_count = data_df['Caller'].value_counts()
sns.barplot(group_count.index[:9], group_count.values[:9], alpha=0.8)
plt.title('User wise number of tickets')
plt.ylabel('Number of tickets', fontsize=12)
plt.xlabel('Users', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

The highest number of ticket created by a single user is 810 (approx 9% of all tickets).

Group 0 has a large number of tickets. Here is Group 0 analysis.

In [ ]:
df2 = data_df[data_df['Assignment group']=='GRP_0']
pd.set_option("max_rows", None)
df2.dropna(axis=0,inplace=True)
df2.reset_index(drop=True,inplace=True)

group_type = []
for i in range (len(df2)):
    search_list1 = ['log', 'login', 'account', 'username','password', 'join','id','access','internet']
    search_list2 = ['software', 'server', 'vpn','microsoft','skype','gmail', 'outlook','email']
    if re.compile('|'.join(search_list1),re.IGNORECASE).search(df2['Short description'][i]):
        group_type.append('A')
    elif re.compile('|'.join(search_list2),re.IGNORECASE).search(df2['Short description'][i]):
        group_type.append('B')
    else :
        group_type.append('C')
        
df2['Group_Type'] = group_type

sns.countplot(df2['Group_Type'])
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f95e9c9b3c8>

Find duplicates in data

In [ ]:
data_df[data_df.duplicated()]
Out[ ]:
Short description Description Caller Assignment group
51 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
229 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
493 ticket update on inplant_872730 ticket update on inplant_872730 fumkcsji sarmtlhy GRP_0
512 blank call //gso blank call //gso rbozivdq gmlhrtvp GRP_0
667 job bkbackup_tool_powder_prod_full failed in j... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
724 blank call blank call rbozivdq gmlhrtvp GRP_0
1064 job Job_1967d failed in job_scheduler at: 10/1... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
1125 blank call blank call rbozivdq gmlhrtvp GRP_0
1744 phone issue phone issue gzjtweph mnslwfqv GRP_0
1851 reset passwords for fylrosuk kedgmiul using pa... the fylrosuk kedgmiul GRP_17
1982 call came and got disconnected call came and got disconnected rbozivdq gmlhrtvp GRP_0
2000 job Job_549 failed in job_scheduler at: 10/07/... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
2061 blank call // loud noise // gso blank call // loud noise // gso rbozivdq gmlhrtvp GRP_0
2141 blank call blank call rbozivdq gmlhrtvp GRP_0
2533 reset passwords for qwsjptlo hnlasbed using pa... the goaxzsql qpjnbgsa GRP_17
2554 reset passwords for bxeagsmt zrwdgsco using pa... the bxeagsmt zrwdgsco GRP_17
2683 ticket update ticket update pbhmwqtz wqlbudjx GRP_0
2714 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
2720 german call german call ayrhcfxi zartupsw GRP_0
2789 blank call blank call pwkrlqbc zslqfmka GRP_0
2875 blank call blank call pwkrlqbc zslqfmka GRP_0
2876 blank call blank call pwkrlqbc zslqfmka GRP_0
3085 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
3219 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
3619 call came and got disconnected call came and got disconnected rbozivdq gmlhrtvp GRP_0
3637 blank call blank call fumkcsji sarmtlhy GRP_0
3647 答复: 答复: order products online problem \r\n\r\nreceived from: fkdazsmi.yecbrofv@gmail... fkdazsmi yecbrofv GRP_0
3693 reset passwords for mvhcoqed konjdmwq using pa... the mvhcoqed konjdmwq GRP_17
3908 vpn not working- vpn.company.com link is givi... vpn not working- vpn.company.com link is givi... kailyenh zfyvkopr GRP_0
4094 job Job_2883 failed in job_scheduler at: 09/18... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
4229 not able to access -inq industrial (-inq.indus... \r\n\r\nreceived from: muqdlobv.qflsdahg@gmail... muqdlobv qflsdahg GRP_0
4273 blank call blank call pwkrlqbc zslqfmka GRP_0
4303 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
4361 account locked in ad account locked in ad gvsabjhq cgwsbiep GRP_0
4495 job SID_37hoti failed in job_scheduler at: 09/... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_5
4530 blank call blank call fumkcsji sarmtlhy GRP_0
4550 call disconnected due to vpn disconnection call disconnected due to vpn disconnection rbozivdq gmlhrtvp GRP_0
4704 private address fields are enabled on employee... disable private address fields, new & edit but... tavsikpl dcrkwuny GRP_15
4881 install company barcode für ewew8323504 \vzqo... install company barcode für ewew8323504 \vzqo... vzqomdgt jwoqbuml GRP_24
4984 reset passwords for cubdsrml znewqgop using pa... the cubdsrml znewqgop GRP_17
4991 reset passwords for davidthd robankm using pas... the zelunfcq yimdwjrp GRP_17
5212 blank call blank call fumkcsji sarmtlhy GRP_0
5226 blank call blank call olckhmvx pcqobjnd GRP_0
5317 reset passwords for bxeagsmt zrwdgsco using pa... the bxeagsmt zrwdgsco GRP_17
5488 job SID_38hotf failed in job_scheduler at: 09/... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
5521 blank call //gso blank call //gso rbozivdq gmlhrtvp GRP_0
5708 reset passwords for bxeagsmt zrwdgsco using pa... the bxeagsmt zrwdgsco GRP_17
5884 reset passwords for bxeagsmt zrwdgsco using pa... the bxeagsmt zrwdgsco GRP_17
5928 ticket update on inplant_855239 ticket update on inplant_855239 fumkcsji sarmtlhy GRP_0
5945 blank call //gso blank call //gso rbozivdq gmlhrtvp GRP_0
6058 reset passwords for bxeagsmt zrwdgsco using pa... the bxeagsmt zrwdgsco GRP_17
6130 job Job_749 failed in job_scheduler at: 08/27/... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
6141 job Job_1989 failed in job_scheduler at: 08/27... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_6
6252 job Job_3028 failed in job_scheduler at: 08/26... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
6260 job Job_3028 failed in job_scheduler at: 08/25... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
6265 job pp_EU_tool_netch_ap1 failed in job_schedul... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
6321 job Job_1314 failed in job_scheduler at: 08/25... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_60
6323 job Job_1314 failed in job_scheduler at: 08/25... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_60
6340 probleme mit erpgui \vsdtxwry ngkcdjye probleme mit erpgui \vsdtxwry ngkcdjye vsdtxwry ngkcdjye GRP_24
6411 svc-now ticket found... doing nothing received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_60
6412 svc-now ticket found... doing nothing received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_60
6471 job SID_41arc2 failed in job_scheduler at: 08/... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
6485 job SID_31arc2 failed in job_scheduler at: 08/... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
6521 job Job_3028 failed in job_scheduler at: 08/24... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
6522 job Job_3028 failed in job_scheduler at: 08/24... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
6523 job Job_3028 failed in job_scheduler at: 08/24... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
6524 job Job_3028 failed in job_scheduler at: 08/24... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
6603 account unlock account unlock jusenflm sufbehom GRP_0
6659 job Job_3028 failed in job_scheduler at: 08/23... received from: monitoring_tool@company.com\r\n... bpctwhsn kzqsbmtp GRP_8
6739 blank call // gso blank call // gso rbozivdq gmlhrtvp GRP_0
6819 reset passwords for wvdxnkhf jirecvta using pa... the wvdxnkhf jirecvta GRP_17
6942 call came and got disconnected call came and got disconnected rbozivdq gmlhrtvp GRP_0
6992 probleme mit erpgui \tmqfjard qzhgdoua probleme mit erpgui \tmqfjard qzhgdoua tmqfjard qzhgdoua GRP_24
7034 blank call blank call fumkcsji sarmtlhy GRP_0
7132 reset passwords for ezrsdgfc hofgvwel using pa... the ezrsdgfc hofgvwel GRP_17
7459 account locked in ad account locked in ad upiyobvj lwohuizr GRP_0
7756 german call german call rbozivdq gmlhrtvp GRP_0
7772 blank call // loud noise blank call // loud noise rbozivdq gmlhrtvp GRP_0
7836 probleme mit erpgui \tmqfjard qzhgdoua probleme mit erpgui \tmqfjard qzhgdoua tmqfjard qzhgdoua GRP_24
8051 issue on pricing in distributor_tool we have agreed price with many of the distribu... hbmwlprq ilfvyodx GRP_21
8093 reset passwords for prgthyuulla ramdntythanjes... the boirqctx bkijgqry GRP_17
8347 blank call // loud noise blank call // loud noise rbozivdq gmlhrtvp GRP_0
8405 unable to launch outlook unable to launch outlook wjtzrmqc ikqpbflg GRP_0
In [ ]:
data_df[data_df.duplicated()].count()
Out[ ]:
Short description    83
Description          83
Caller               83
Assignment group     83
dtype: int64

There are 84 duplicate tickets having all 4 columns same.

Word cloud on Description column

In [ ]:
desc = " ".join(des for des in data_df.Description)

wc_desc = WordCloud(background_color='white', max_words=200, width=400, height=400,random_state=10).generate(desc)
plt.figure(figsize=(10,10))
plt.imshow(wc_desc)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f95eaf611d0>

Word cloud on Short description column

In [ ]:
sh_desc = " ".join(sh_des for sh_des in data_df['Short description'])

wc_sh_desc = WordCloud(background_color='white', max_words=200, width=400, height=400,random_state=10).generate(sh_desc)
plt.figure(figsize=(10,10))
plt.imshow(wc_sh_desc)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f95eaf6d860>

Group wise word cloud

In [ ]:
plt.figure(figsize=(20,20))

for index, i in enumerate(data_df['Assignment group'].unique()):
  s = str(i)
  i = str(data_df[data_df['Assignment group']==s].Description)
  i = WordCloud(background_color='white', max_words=200, width=400, height=400,random_state=10).generate(i)
  c = index+1
  plt.subplot(9,9,c)
  plt.imshow(i)
  plt.title(s)

Data Preprocessing

Remove duplicates

In [ ]:
data_df.drop_duplicates(inplace=True)
data_df.reset_index(drop=True,inplace=True)

Remove "Reported by emailid" words

In [ ]:
#replacing email ids using caller column

import math
df1=[]
str1="";
validstring=1;
for i in data_df.index:
  validstring = 1;
  str1 = data_df.iloc[i].Description
  if str1 == '' or pd.isnull(str1):
      validstring=0;
  if validstring != 0:
    a = data_df.iloc[i].Caller.split()
    tp = a[0] + '.' + a[1] + '@gmail.com'
    if re.search( tp, data_df.iloc[i].Description ):
      str1 = data_df.iloc[i].Description.replace(tp,'')
      #print(i ," replaced")
  df1.append(str1)
tp = pd.DataFrame(df1,columns=["Description"])
In [ ]:
#replacing 'received from:' string

df2=[]
validstring=1;
for i in tp.index:
  validstring=1;
  str1 = data_df.iloc[i].Description
  testString = 'received from:'
  if str1 == '' or pd.isnull(str1):
      validstring=0;
  if validstring != 0:
    if re.search( testString, tp.iloc[i].Description ):
      str1 = tp.iloc[i].Description.replace(testString,'')
      #print(i ," replaced")
  df2.append(str1)
tp2 = pd.DataFrame(df2,columns=["Description"])
In [ ]:
#Remove all remaining email ids
for i in tp2.index:
  tp2.iloc[i].Description = tp2.iloc[i].Description.replace('\n', ' ').replace('\r', '')
  tp2.iloc[i].Description = re.sub(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", '', tp2.iloc[i].Description)
In [ ]:
STOPWORDS = set(stopwords.words('english'))
esc2 = " ".join(des for des in tp2.Description)

wc_desc = WordCloud(background_color='white',stopwords=STOPWORDS, max_words=200, width=400, height=400,random_state=10).generate(esc2)
plt.figure(figsize=(10,10))
plt.imshow(wc_desc)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f95ddb0c4e0>
In [ ]:
copy_df = data_df
copy_df["Description"] = df2
copy_df.head()
data_df = copy_df
In [ ]:
data_df.head()
Out[ ]:
Short description Description Caller Assignment group
0 login issue -verified user details.(employee# & manager na... spxjnwir pjlcoqds GRP_0
1 outlook \r\n\r\n \r\n\r\nhello team,\r\n\r\nmy meeting... hmjdrvpb komuaywn GRP_0
2 cant log in to vpn \r\n\r\n \r\n\r\nhi\r\n\r\ni cannot log on to ... eylqgodm ybqkwiam GRP_0
3 unable to access hr_tool page unable to access hr_tool page xbkucsvz gcpydteq GRP_0
4 skype error skype error owlgqjme qhcozdfx GRP_0

Data normalization. Acronyms handling

In [ ]:
contradictions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}
In [ ]:
print(stopwords.words('english'))
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

The below method is used to clean the incoming text. The text is first converted to lower case,then contradictions are normalized and porter stemming is applied. After that the punctuations, special characters and stop words are removed.We also have some common stop words found within the dataset which we have removed as well.

In [ ]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z]')
disclaimers = ['select the following link to view the disclaimer in an alternate language.', 'this communication (including any accompanying documents) is intended only for the sole use of the person(s) to whom it is addressed and may contain information that is privileged,confidential and exempt from disclosure. any unauthorised reading,dissemination ,distribution,duplication of this communication by someone other than the intended recipient is strictly prohibited. if your receipt of this communication is in error,please notify the sender and destrtgoy the original communication immediately','please do not print this email unless it is absolutely necessary. spread environmental awareness','this mailbox is not monitored.please call support at the phone number in this communication for any questions you may have']
def clean_text(text, remove_stopwords=True):
  STOPWORDS = set(stopwords.words('english'))
  porter = PorterStemmer()

  # Convert words to lower case
  text = text.lower()

  # Remove all disclaimers
  for i in disclaimers:
        text = text.replace(i,'')

  #Remove numbers
  text = re.sub('[0-9]', ' ', text)
  if True:
    text = text.split()
    
    if not text:
      return ' '
    new_text = []

    #Remove contradictions and punctuations
    for word in text:
      if word in contradictions:
        new_text.append(porter.stem(contradictions[word]))
      elif word in string.punctuation:
        new_text.append("")
      else:
        new_text.append(porter.stem(word))


      text = " ".join(new_text)
      text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
      text = BAD_SYMBOLS_RE.sub(' ', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing.

      #Remove english and custom stop words
      if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        newstop_words = ['yes','no','na','mii','hii','hello','hi','help','please','receiv','received','dear','company','from','sent','to','subject','mailto','email','unabl','need','pleas','issu','com','compani','kennametal.com','http','widia','regards','see','phone','thanks','thankyou','bitte']
        text = [w for w in text if not w in newstop_words]
        text = " ".join(w for w in text if not w in stops)
  return text
In [ ]:
data_df['Description'] = data_df['Description'].apply(clean_text)
data_df['Short description'] = data_df['Short description'].apply(clean_text)
In [ ]:
data_df.isnull().any()
Out[ ]:
Short description    False
Description          False
Caller               False
Assignment group     False
dtype: bool
In [ ]:
data_df.head()
Out[ ]:
Short description Description Caller Assignment group
0 login verifi user details employee manag name check ... spxjnwir pjlcoqds GRP_0
1 outlook team meetings skyp meet etc appear outlook cal... hmjdrvpb komuaywn GRP_0
2 cant log vpn cannot log vpn best eylqgodm ybqkwiam GRP_0
3 access hr tool page access hr tool page xbkucsvz gcpydteq GRP_0
4 skype error skype error owlgqjme qhcozdfx GRP_0

Word cloud on description column after data clensing

In [ ]:
STOPWORDS = set(stopwords.words('english'))
In [ ]:
esc3 = " ".join(des for des in data_df.Description)

wc_desc = WordCloud(background_color='white',stopwords=STOPWORDS, max_words=200, width=400, height=400,random_state=10).generate(esc3)
plt.figure(figsize=(10,10))
plt.imshow(wc_desc)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f95dda60550>

Word cloud on Short description column after data clensing

In [ ]:
sh_desc2 = " ".join(sh_des for sh_des in data_df['Short description'])

wc_sh_desc = WordCloud(background_color='white',stopwords=STOPWORDS, max_words=200, width=400, height=400,random_state=10).generate(sh_desc2)
plt.figure(figsize=(10,10))
plt.imshow(wc_sh_desc)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f95dd854b70>

Group wise word cloud

In [ ]:
plt.figure(figsize=(20,20))

for index, i in enumerate(data_df['Assignment group'].unique()):
  s = str(i)
  i = str(data_df[data_df['Assignment group']==s].Description)
  i = WordCloud(background_color='white',stopwords=STOPWORDS, max_words=200, width=400, height=400,random_state=10).generate(i)
  c = index+1
  plt.subplot(9,9,c)
  plt.imshow(i)
  plt.title(s)
In [ ]:
data_df.to_csv("all_74_groups_multilingual.csv")

Dealing with Languages other than English

In [ ]:
pip install fasttext
Requirement already satisfied: fasttext in /usr/local/lib/python3.6/dist-packages (0.9.2)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from fasttext) (1.18.5)
Requirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.6/dist-packages (from fasttext) (2.5.0)
Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from fasttext) (47.3.1)
In [ ]:
limodel = '/content/drive/My Drive/lid.176.ftz'
import fasttext
lid_model = fasttext.load_model(limodel) 
def predict_lang(model,texts): return model.predict(texts,k=1)
otherLanguagesDf = pd.DataFrame()
Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.
In [ ]:
for index in data_df.index:
  prediction = predict_lang(lid_model,data_df.iloc[index]["Description"])
  label = prediction[0][0].split("__label__")[1]
  confidence = prediction[1][0]
  if label != "en" and confidence > 0.50:
    print (index)
    otherLanguagesDf = otherLanguagesDf.append({'index':index ,'label': label,"confidence": confidence, "text":data_df.iloc[index]["Description"]}, ignore_index=True)
plt.title("Contribution of other languages in the dataset")
sns.countplot(otherLanguagesDf['label'])
20
26
62
123
220
222
249
253
263
268
270
282
300
306
308
333
337
347
355
361
363
366
368
425
439
448
451
453
461
464
479
504
538
551
566
568
570
574
578
582
585
591
593
601
604
605
606
617
620
644
646
739
745
749
755
758
759
767
771
773
776
778
857
858
873
878
887
900
901
906
907
918
936
961
965
987
991
1005
1010
1012
1030
1032
1038
1041
1042
1047
1051
1053
1106
1116
1126
1135
1138
1141
1147
1148
1149
1158
1161
1167
1202
1236
1239
1240
1264
1265
1266
1267
1269
1275
1278
1280
1281
1336
1396
1433
1437
1443
1504
1530
1535
1540
1542
1544
1557
1600
1609
1614
1622
1633
1640
1648
1649
1656
1657
1659
1666
1669
1670
1728
1757
1759
1760
1762
1763
1765
1775
1776
1780
1781
1820
1857
1858
1862
1875
1889
1904
1912
1965
1994
2006
2024
2027
2033
2051
2057
2058
2068
2071
2072
2073
2074
2080
2087
2105
2110
2130
2141
2142
2148
2157
2166
2167
2168
2172
2173
2178
2197
2198
2216
2246
2268
2271
2272
2280
2287
2293
2368
2369
2383
2387
2389
2391
2392
2393
2401
2406
2408
2424
2425
2440
2486
2490
2493
2559
2595
2680
2692
2703
2708
2709
2710
2711
2719
2771
2778
2780
2782
2784
2795
2816
2817
2818
2833
2835
2857
2912
2914
2924
2936
2938
2946
2950
2953
2964
2971
2976
3041
3048
3049
3072
3085
3139
3168
3200
3211
3212
3236
3257
3258
3281
3297
3353
3358
3400
3407
3409
3412
3414
3422
3426
3435
3452
3465
3511
3520
3525
3541
3542
3546
3548
3564
3570
3608
3636
3642
3648
3653
3657
3658
3659
3660
3687
3693
3720
3788
3790
3792
3793
3804
3809
3812
3828
3830
3831
3839
3860
3873
3943
3953
3954
3955
3958
3968
3969
3977
4006
4011
4045
4087
4093
4157
4160
4171
4173
4175
4182
4224
4269
4270
4292
4304
4309
4311
4313
4325
4346
4364
4402
4411
4412
4419
4420
4421
4423
4426
4427
4434
4439
4461
4499
4534
4535
4539
4540
4541
4552
4556
4607
4620
4632
4647
4658
4674
4676
4677
4679
4681
4696
4697
4698
4782
4849
4851
4852
4857
4858
4860
4871
4924
4926
4932
4938
4940
4942
4943
4945
4951
4954
4956
4972
5027
5043
5053
5148
5182
5187
5194
5197
5211
5238
5242
5243
5248
5249
5250
5251
5266
5369
5385
5404
5407
5408
5409
5410
5418
5425
5426
5479
5501
5524
5525
5530
5536
5537
5578
5609
5641
5653
5674
5676
5679
5767
5835
5845
5849
5864
5904
5932
5933
5946
5977
5988
5997
5998
5999
6001
6003
6004
6006
6008
6014
6015
6017
6018
6025
6027
6052
6127
6145
6146
6160
6162
6163
6164
6167
6173
6178
6193
6194
6197
6206
6222
6258
6262
6278
6282
6283
6285
6288
6297
6298
6305
6306
6309
6311
6314
6318
6320
6327
6346
6386
6398
6406
6407
6425
6426
6429
6437
6439
6442
6443
6444
6449
6517
6523
6546
6556
6559
6560
6563
6570
6573
6579
6580
6601
6649
6700
6710
6711
6719
6726
6727
6728
6733
6737
6742
6747
6757
6765
6766
6872
7021
7024
7035
7036
7038
7044
7048
7049
7098
7116
7148
7161
7165
7168
7173
7174
7179
7182
7203
7211
7219
7275
7319
7325
7329
7340
7343
7347
7356
7361
7370
7375
7398
7412
7436
7466
7467
7479
7480
7485
7486
7488
7498
7522
7634
7641
7642
7677
7694
7702
7714
7715
7727
7729
7740
7743
7747
7751
7752
7754
7755
7785
7825
7827
7845
7849
7852
7869
7941
7947
7954
7955
7990
8016
8022
8024
8030
8109
8113
8122
8133
8135
8149
8151
8157
8161
8201
8275
8315
8319
8330
8336
8337
8342
8344
8356
8382
8384
8416
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f95dddd9e48>
In [ ]:
otherLanguagesDf['label'].value_counts()
Out[ ]:
de     512
pl      24
ca      19
it      17
fr      12
pt      12
es       5
ie       5
tl       3
hu       3
ceb      2
nl       2
fi       2
jv       2
ru       2
sl       1
sv       1
id       1
kw       1
sw       1
ko       1
jbo      1
ia       1
th       1
Name: label, dtype: int64
In [ ]:
#Total other languages are 7% of whole dataset. We can remove these entries to make our dataset comprise only of English language tickets.
indexesToRemove = otherLanguagesDf["index"]
In [ ]:
data_df = data_df.drop(indexesToRemove)
In [ ]:
data_df.shape
Out[ ]:
(7786, 4)
In [ ]:
data_df.isnull().any()
Out[ ]:
Short description    False
Description          False
Caller               False
Assignment group     False
dtype: bool
In [ ]:
data_df.to_csv("all_74_groups_only_english.csv")

Model building

Merge all columns into a single column

In [ ]:
data_df['MergedColumn'] = data_df[data_df.columns[0:3]].apply(
    lambda x: ' '.join(x.astype(str)),
    axis=1
)
data_df = data_df.drop(['Short description','Description','Caller'],axis=1)
data_df.head()
Out[ ]:
Assignment group MergedColumn
0 GRP_0 login verifi user details employee manag name ...
1 GRP_0 outlook team meetings skyp meet etc appear out...
2 GRP_0 cant log vpn cannot log vpn best eylqgodm ybqk...
3 GRP_0 access hr tool page access hr tool page xbkucs...
4 GRP_0 skype error skype error owlgqjme qhcozdfx

Find the maximum length of the Merged column

In [ ]:
max = 1
maxindex = 0;
lengthsdf = pd.DataFrame()
for ind in data_df.index:
  if len(data_df['MergedColumn'][ind]) > max:
    max = len(data_df['MergedColumn'][ind])
    maxindex = ind
print(max)
6111

Tokenize the Merged column

In [ ]:
max_features = max
maxlen = 200 
embedding_size = 200
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data_df['MergedColumn'])
In [ ]:
X = tokenizer.texts_to_sequences(data_df['MergedColumn'])
X = pad_sequences(X, maxlen = maxlen)
y = np.asarray(data_df['Assignment group'])
y = pd.get_dummies(data_df['Assignment group']).values

Split data into train and test datasets

In [ ]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Extract the GloVe embedding file

In [ ]:
project_path = '/content/drive/My Drive/'
glove_file = project_path + "Copy_of_glove.6B.zip"
#Extract Glove embedding zip file
from zipfile import ZipFile
with ZipFile(glove_file, 'r') as z:
  z.extractall()

Embed each word

In [ ]:
EMBEDDING_FILE = './glove.6B.200d.txt'

embeddings = {}
for o in open(EMBEDDING_FILE):
    word = o.split(" ")[0]
    # print(word)
    embd = o.split(" ")[1:]
    embd = np.asarray(embd, dtype='float32')
    # print(embd)
    embeddings[word] = embd

Find the num_words and create embedding matrix

In [ ]:
num_words = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((num_words, 200))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

len(embeddings.values())
Out[ ]:
400000
In [ ]:
num_words
Out[ ]:
15974

Create model skeleton

In [ ]:
model = Sequential()
model.add(Embedding(num_words, embedding_size, weights = [embedding_matrix]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(74, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, None, 200)         3194800   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, None, 200)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               120400    
_________________________________________________________________
dense_1 (Dense)              (None, 74)                7474      
=================================================================
Total params: 3,322,674
Trainable params: 3,322,674
Non-trainable params: 0
_________________________________________________________________
None

Train the model

In [ ]:
epochs = 30
batch_size = 64 
 
history = model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1)
Epoch 1/30
88/88 [==============================] - 61s 698ms/step - loss: 2.5307 - accuracy: 0.4965 - val_loss: 2.0140 - val_accuracy: 0.5602
Epoch 2/30
88/88 [==============================] - 61s 690ms/step - loss: 1.8999 - accuracy: 0.5690 - val_loss: 1.7877 - val_accuracy: 0.5811
Epoch 3/30
88/88 [==============================] - 61s 690ms/step - loss: 1.6535 - accuracy: 0.5913 - val_loss: 1.6215 - val_accuracy: 0.5971
Epoch 4/30
88/88 [==============================] - 61s 695ms/step - loss: 1.4361 - accuracy: 0.6318 - val_loss: 1.5153 - val_accuracy: 0.6067
Epoch 5/30
88/88 [==============================] - 61s 697ms/step - loss: 1.2546 - accuracy: 0.6683 - val_loss: 1.4398 - val_accuracy: 0.6228
Epoch 6/30
88/88 [==============================] - 61s 697ms/step - loss: 1.1124 - accuracy: 0.7013 - val_loss: 1.4128 - val_accuracy: 0.6421
Epoch 7/30
88/88 [==============================] - 61s 693ms/step - loss: 0.9715 - accuracy: 0.7411 - val_loss: 1.3761 - val_accuracy: 0.6404
Epoch 8/30
88/88 [==============================] - 61s 696ms/step - loss: 0.8598 - accuracy: 0.7615 - val_loss: 1.3455 - val_accuracy: 0.6597
Epoch 9/30
88/88 [==============================] - 61s 693ms/step - loss: 0.7506 - accuracy: 0.7954 - val_loss: 1.3582 - val_accuracy: 0.6677
Epoch 10/30
88/88 [==============================] - 62s 700ms/step - loss: 0.6736 - accuracy: 0.8096 - val_loss: 1.3331 - val_accuracy: 0.6774
Epoch 11/30
88/88 [==============================] - 62s 701ms/step - loss: 0.6061 - accuracy: 0.8275 - val_loss: 1.3676 - val_accuracy: 0.6709
Epoch 12/30
88/88 [==============================] - 62s 710ms/step - loss: 0.5396 - accuracy: 0.8475 - val_loss: 1.3492 - val_accuracy: 0.6838
Epoch 13/30
88/88 [==============================] - 61s 694ms/step - loss: 0.4857 - accuracy: 0.8617 - val_loss: 1.3721 - val_accuracy: 0.6806
Epoch 14/30
88/88 [==============================] - 61s 698ms/step - loss: 0.4395 - accuracy: 0.8719 - val_loss: 1.3661 - val_accuracy: 0.6709
Epoch 15/30
88/88 [==============================] - 61s 696ms/step - loss: 0.3877 - accuracy: 0.8878 - val_loss: 1.3860 - val_accuracy: 0.6790
Epoch 16/30
88/88 [==============================] - 61s 698ms/step - loss: 0.3491 - accuracy: 0.8956 - val_loss: 1.4027 - val_accuracy: 0.6790
Epoch 17/30
88/88 [==============================] - 62s 699ms/step - loss: 0.3316 - accuracy: 0.9024 - val_loss: 1.4153 - val_accuracy: 0.6806
Epoch 18/30
88/88 [==============================] - 61s 697ms/step - loss: 0.2982 - accuracy: 0.9104 - val_loss: 1.4259 - val_accuracy: 0.6742
Epoch 19/30
88/88 [==============================] - 61s 698ms/step - loss: 0.2784 - accuracy: 0.9151 - val_loss: 1.4807 - val_accuracy: 0.6838
Epoch 20/30
88/88 [==============================] - 61s 699ms/step - loss: 0.2614 - accuracy: 0.9204 - val_loss: 1.4596 - val_accuracy: 0.6774
Epoch 21/30
88/88 [==============================] - 62s 699ms/step - loss: 0.2460 - accuracy: 0.9245 - val_loss: 1.4813 - val_accuracy: 0.6822
Epoch 22/30
88/88 [==============================] - 62s 699ms/step - loss: 0.2387 - accuracy: 0.9272 - val_loss: 1.4895 - val_accuracy: 0.6790
Epoch 23/30
88/88 [==============================] - 61s 695ms/step - loss: 0.2320 - accuracy: 0.9252 - val_loss: 1.5443 - val_accuracy: 0.6902
Epoch 24/30
88/88 [==============================] - 62s 700ms/step - loss: 0.2194 - accuracy: 0.9304 - val_loss: 1.5629 - val_accuracy: 0.6693
Epoch 25/30
88/88 [==============================] - 61s 693ms/step - loss: 0.2169 - accuracy: 0.9293 - val_loss: 1.5262 - val_accuracy: 0.6661
Epoch 26/30
88/88 [==============================] - 61s 698ms/step - loss: 0.2025 - accuracy: 0.9342 - val_loss: 1.6100 - val_accuracy: 0.6693
Epoch 27/30
88/88 [==============================] - 64s 730ms/step - loss: 0.1988 - accuracy: 0.9343 - val_loss: 1.6439 - val_accuracy: 0.6822
Epoch 28/30
88/88 [==============================] - 61s 699ms/step - loss: 0.1905 - accuracy: 0.9372 - val_loss: 1.6429 - val_accuracy: 0.6790
Epoch 29/30
88/88 [==============================] - 61s 697ms/step - loss: 0.1897 - accuracy: 0.9329 - val_loss: 1.6171 - val_accuracy: 0.6742
Epoch 30/30
88/88 [==============================] - 62s 701ms/step - loss: 0.1795 - accuracy: 0.9399 - val_loss: 1.6602 - val_accuracy: 0.6902

Test the model on test set

In [ ]:
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))
Accuracy: 67.073172

Plot the accuracy and loss functions

In [ ]:
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='test')
plt.legend()
plt.show();
In [ ]:
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show();

Conclusion - The performance of model on test set is 67.07%. The loss and accuracy curve suggests that this is due to biased nature of dataset.